library(hexbin)
# Load the movie data set and preview its first rows. Besides the raw
# variables, the file already carries pre-computed t_* columns (see the
# printed header below).
mdata <- read.csv("movie_rt_transform.csv")
head(mdata, n = 6L)
## X gross genres
## 1 1 760505847 Action|Adventure|Fantasy|Sci-Fi
## 2 2 309404152 Action|Adventure|Fantasy
## 3 4 448130642 Action|Thriller
## 4 6 73058679 Action|Adventure|Sci-Fi
## 5 7 336530303 Action|Adventure|Romance
## 6 8 200807262 Adventure|Animation|Comedy|Family|Fantasy|Musical|Romance
## movie_title
## 1 Avatar
## 2 Pirates of the Caribbean: At World's End
## 3 The Dark Knight Rises
## 4 John Carter
## 5 Spider-Man 3
## 6 Tangled
## movie_imdb_link budget
## 1 http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1 237000000
## 2 http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1 300000000
## 3 http://www.imdb.com/title/tt1345836/?ref_=fn_tt_tt_1 250000000
## 4 http://www.imdb.com/title/tt0401729/?ref_=fn_tt_tt_1 263700000
## 5 http://www.imdb.com/title/tt0413300/?ref_=fn_tt_tt_1 258000000
## 6 http://www.imdb.com/title/tt0398286/?ref_=fn_tt_tt_1 260000000
## title_year imdb_score Action Adventure Animation Biography Comedy Crime
## 1 2009 7.9 1 1 0 0 0 0
## 2 2007 7.1 1 1 0 0 0 0
## 3 2012 8.5 1 0 0 0 0 0
## 4 2012 6.6 1 1 0 0 0 0
## 5 2007 6.2 1 1 0 0 0 0
## 6 2010 7.8 0 1 1 0 1 0
## Documentary Drama Family Fantasy History Horror Music Musical Mystery
## 1 0 0 0 1 0 0 0 0 0
## 2 0 0 0 1 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 1 1 0 0 0 1 0
## Romance Sci.Fi Sport Thriller War Western rt_score t_gross t_budget
## 1 0 1 0 0 0 0 83 660.2562 492.3032
## 2 0 0 0 0 0 0 45 526.5075 522.4296
## 3 0 0 0 1 0 0 87 577.9841 498.9734
## 4 0 1 0 0 0 0 51 365.8094 505.7269
## 5 1 0 0 0 0 0 63 537.7714 502.9498
## 6 1 0 0 0 0 0 89 472.1621 503.9294
## t_imdb_score t_rt_score
## 1 7.9 0.94133219
## 2 7.1 -0.12441070
## 3 8.5 1.10923983
## 4 6.6 0.02482065
## 5 6.2 0.32844628
## 6 7.8 1.20624569
# Histograms of the four continuous variables (column -> title, x label).
hist_specs <- list(
  gross      = c("Histogram of Gross", "Gross"),
  budget     = c("Histogram of Budget", "Budget"),
  imdb_score = c("Histogram of IMDB Scores", "IMDB Score"),
  rt_score   = c("Histogram of Rotten Tomato Scores", "Rotten Tomato Score")
)
for (v in names(hist_specs)) {
  hist(mdata[[v]], main = hist_specs[[v]][1], xlab = hist_specs[[v]][2])
}

# Normal-score transform of the RT score: (score + 0.5) / 101 is pushed
# through the standard normal quantile function.
t_rt_score <- qnorm((mdata$rt_score + 0.5) / 101, mean = 0)
hist(t_rt_score)

# Bivariate density (hexagonal binning) on the raw and on the log scale.
plot(
  hexbin(mdata$gross, mdata$budget),
  main = "Gross vs. Budget", legend = 0,
  xlab = "Gross (USD)", ylab = "Budget (USD)"
)
plot(
  hexbin(mdata$imdb_score, mdata$rt_score),
  main = "IMDB Score vs. RT Score", legend = 0,
  xlab = "IMDB Score", ylab = "RT Score"
)
plot(
  hexbin(log(mdata$gross), log(mdata$budget)),
  main = "Gross vs. Budget", legend = 0,
  xlab = "log gross", ylab = "log budget"
)

# Scale every variable by its maximum so all four fit on one boxplot axis.
mdata3 <- mdata
for (v in c("gross", "budget", "imdb_score", "rt_score")) {
  mdata3[[paste0("norm_", v)]] <- mdata[[v]] / max(mdata[[v]])
}
boxplot(
  mdata3[, c("norm_gross", "norm_budget", "norm_imdb_score", "norm_rt_score")],
  main = "Univariate Summary"
)
First the untransformed scatter plot.
# Hexbin scatter-plot matrix of the untransformed response and covariates
# (columns 2, 6, 8 and 30 of the data set, selected here by name).
mdata2 <- mdata
hexplom(~ mdata2[, c("gross", "budget", "imdb_score", "rt_score")])
Then log-transform gross and budget.
# Add log-scale versions of gross and budget, then redraw the scatter-plot
# matrix with the logged columns in place of the raw ones.
mdata2[["log_gross"]] <- log(mdata2[["gross"]])
mdata2[["log_budget"]] <- log(mdata2[["budget"]])
hexplom(~ mdata2[, c("log_budget", "log_gross", "imdb_score", "rt_score")])
# Baseline: gross regressed on budget and both review scores, all on the
# original scale. The formula is kept term-for-term so the printed call and
# coefficient labels match the recorded output.
model <- lm(
  mdata$gross ~ mdata$budget + mdata$imdb_score + mdata$rt_score
)
summary(model)
##
## Call:
## lm(formula = mdata$gross ~ mdata$budget + mdata$imdb_score +
## mdata$rt_score)
##
## Residuals:
## Min 1Q Median 3Q Max
## -229627824 -26241779 -7469321 16632442 468357060
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.390e+07 1.054e+07 -4.167 3.29e-05 ***
## mdata$budget 1.103e+00 2.880e-02 38.309 < 2e-16 ***
## mdata$imdb_score 5.625e+06 2.020e+06 2.784 0.00544 **
## mdata$rt_score 3.628e+05 7.613e+04 4.765 2.09e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54490000 on 1325 degrees of freedom
## Multiple R-squared: 0.5611, Adjusted R-squared: 0.5601
## F-statistic: 564.7 on 3 and 1325 DF, p-value: < 2.2e-16
# Residuals vs. fitted values for the raw-scale model. The lm components are
# named in full (`fitted.values`, `residuals`) instead of relying on `$`
# partial matching.
plot(
  model$fitted.values, model$residuals,
  main = "residual plot", xlab = "Fitted", ylab = "Residuals"
)

# Refit with gross and budget on the log scale; the scores stay untransformed.
model <- lm(
  log(mdata$gross) ~ log(mdata$budget) + mdata$imdb_score + mdata$rt_score
)
summary(model)
##
## Call:
## lm(formula = log(mdata$gross) ~ log(mdata$budget) + mdata$imdb_score +
## mdata$rt_score)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.4637 -0.5004 0.1653 0.7933 9.0791
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.633252 0.534917 -3.053 0.00231 **
## log(mdata$budget) 1.055704 0.028726 36.751 < 2e-16 ***
## mdata$imdb_score 0.004319 0.056501 0.076 0.93908
## mdata$rt_score 0.010511 0.002146 4.898 1.09e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.524 on 1325 degrees of freedom
## Multiple R-squared: 0.5162, Adjusted R-squared: 0.5151
## F-statistic: 471.3 on 3 and 1325 DF, p-value: < 2.2e-16
# Residuals vs. fitted values for the log-log model (full component names,
# no `$` partial matching).
plot(
  model$fitted.values, model$residuals,
  main = "residual plot", xlab = "Fitted", ylab = "Residuals"
)
Try exact transformation.
library(car)
## Warning: package 'car' was built under R version 3.3.2
# Jointly estimate Box-Cox powers for gross, budget, IMDB score and RT score.
# RT score is shifted by .01 before estimation. The fit is computed once and
# the stored object is printed, rather than re-running the same estimation a
# second time only to display it.
ans <- powerTransform(
  cbind(mdata$gross, mdata$budget, mdata$imdb_score, mdata$rt_score + .01) ~ 1
)
ans
## Estimated transformation parameters
## Y1 Y2 Y3 Y4
## 0.2953805 0.2549329 2.4030884 0.7905505
summary(ans)
## bcPower Transformations to Multinormality
## Est.Power Std.Err. Wald Lower Bound Wald Upper Bound
## Y1 0.2954 0.0109 0.2740 0.3167
## Y2 0.2549 0.0140 0.2276 0.2823
## Y3 2.4031 0.1096 2.1882 2.6180
## Y4 0.7906 0.0371 0.7179 0.8632
##
## Likelihood ratio tests about transformation parameters
## LRT df pval
## LR test, lambda = (0 0 0 0) 2827.280 4 0
## LR test, lambda = (1 1 1 1) 3936.344 4 0
## LR test, lambda = (0.3 0.25 2.4 0.79) 0.000 4 1
# Apply the (rounded) estimated Box-Cox powers and refit the linear model.
# box_cox_fixed() is the usual (x^lambda - 1) / lambda form for lambda != 0.
# NOTE(review): the powers above were estimated on rt_score + .01, whereas
# rt_trans is computed from rt_score without the shift -- confirm whether the
# difference is intended.
box_cox_fixed <- function(x, lambda) (x^lambda - 1) / lambda

gross_trans  <- box_cox_fixed(mdata$gross, 0.295)
budget_trans <- box_cox_fixed(mdata$budget, 0.255)
imdb_trans   <- box_cox_fixed(mdata$imdb_score, 2.4)
rt_trans     <- box_cox_fixed(mdata$rt_score, .79)

model <- lm(gross_trans ~ budget_trans + imdb_trans + rt_trans)
hexplom(~ cbind(gross_trans, budget_trans, imdb_trans, rt_trans))
summary(model)
##
## Call:
## lm(formula = gross_trans ~ budget_trans + imdb_trans + rt_trans)
##
## Residuals:
## Min 1Q Median 3Q Max
## -573.78 -102.85 -0.81 100.33 705.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -130.90218 18.04851 -7.253 6.92e-13 ***
## budget_trans 1.87940 0.04392 42.793 < 2e-16 ***
## imdb_trans 1.38693 0.49153 2.822 0.00485 **
## rt_trans 2.26168 0.51584 4.384 1.25e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 159.4 on 1325 degrees of freedom
## Multiple R-squared: 0.6042, Adjusted R-squared: 0.6033
## F-statistic: 674.2 on 3 and 1325 DF, p-value: < 2.2e-16
# Residuals vs. fitted values for the Box-Cox model (full component names,
# no `$` partial matching).
plot(
  model$fitted.values, model$residuals,
  main = "residual plot", xlab = "Fitted", ylab = "Residuals"
)
Try approximate (rounded) transformation powers.
# Simpler powers: 1/4 for gross and budget, a linear plus a squared term for
# the IMDB score, and the RT score left as is.
t_gross  <- with(mdata, (gross^0.25 - 1) / 0.25)
t_budget <- with(mdata, (budget^0.25 - 1) / 0.25)
t_imdb_1 <- mdata[["imdb_score"]]
t_imdb_2 <- with(mdata, (imdb_score^2 - 1) / 2)
t_rt     <- mdata[["rt_score"]]

model1 <- lm(t_gross ~ t_budget + t_imdb_1 + t_imdb_2 + t_rt)
hexplom(~ cbind(t_gross, t_budget, t_imdb_2, t_rt))
summary(model1)
##
## Call:
## lm(formula = t_gross ~ t_budget + t_imdb_1 + t_imdb_2 + t_rt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -284.00 -45.49 2.59 46.17 338.11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 161.09001 44.27268 3.639 0.000285 ***
## t_budget 0.95733 0.02243 42.690 < 2e-16 ***
## t_imdb_1 -62.27125 14.35158 -4.339 1.54e-05 ***
## t_imdb_2 11.49206 2.47120 4.650 3.64e-06 ***
## t_rt 0.43296 0.11042 3.921 9.26e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 74.66 on 1324 degrees of freedom
## Multiple R-squared: 0.6037, Adjusted R-squared: 0.6025
## F-statistic: 504.3 on 4 and 1324 DF, p-value: < 2.2e-16
# Residuals vs. fitted values for model1 (full component names, no `$`
# partial matching).
plot(
  model1$fitted.values, model1$residuals,
  main = "residual plot", xlab = "Fitted", ylab = "Residuals"
)
Normal transform of rotten tomatoes score.
# Re-estimate the Box-Cox powers with the normal-score RT variable in place of
# the raw RT score. It is shifted to be strictly positive (minimum becomes
# .01) before estimation.
ans <- powerTransform(
  cbind(
    mdata$gross,
    mdata$budget,
    mdata$imdb_score,
    t_rt_score - min(t_rt_score) + .01
  ) ~ 1
)
summary(ans)
## bcPower Transformations to Multinormality
## Est.Power Std.Err. Wald Lower Bound Wald Upper Bound
## Y1 0.2952 0.0109 0.2739 0.3165
## Y2 0.2543 0.0139 0.2270 0.2816
## Y3 2.4339 0.1108 2.2167 2.6510
## Y4 0.9676 0.0467 0.8761 1.0590
##
## Likelihood ratio tests about transformation parameters
## LRT df pval
## LR test, lambda = (0 0 0 0) 2995.474884 4 0.0000000
## LR test, lambda = (1 1 1 1) 3899.397058 4 0.0000000
## LR test, lambda = (0.3 0.25 2.43 1) 0.480411 4 0.9753797
# Same approximate transformation as model1, except that the RT covariate is
# now the normal-score version (t_rt_score).
quarter_power <- function(x) (x^0.25 - 1) / 0.25

t_gross  <- quarter_power(mdata$gross)
t_budget <- quarter_power(mdata$budget)
t_imdb_1 <- mdata$imdb_score
t_imdb_2 <- (t_imdb_1^2 - 1) / 2
t_rt     <- t_rt_score

model2 <- lm(t_gross ~ t_budget + t_imdb_1 + t_imdb_2 + t_rt)
hexplom(~ cbind(t_gross, t_budget, t_imdb_2, t_rt))
summary(model2)
##
## Call:
## lm(formula = t_gross ~ t_budget + t_imdb_1 + t_imdb_2 + t_rt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -283.68 -45.24 2.62 46.14 338.89
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 187.0276 43.6062 4.289 1.92e-05 ***
## t_budget 0.9579 0.0224 42.763 < 2e-16 ***
## t_imdb_1 -63.1530 14.2745 -4.424 1.05e-05 ***
## t_imdb_2 11.5402 2.4501 4.710 2.74e-06 ***
## t_rt 14.6784 3.4910 4.205 2.79e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 74.6 on 1324 degrees of freedom
## Multiple R-squared: 0.6044, Adjusted R-squared: 0.6032
## F-statistic: 505.7 on 4 and 1324 DF, p-value: < 2.2e-16
# Hexbin residual plot for model2. The lm components are named in full
# (`fitted.values`, `residuals`) instead of relying on `$` partial matching.
plot(
  hexbin(model2$fitted.values, model2$residuals),
  main = "Residual Plot", xlab = "Fitted", ylab = "Residuals", legend = 0
)
par(mfrow = c(1, 1))
plot(
  hexbin(t_gross, t_budget),
  main = "Gross vs. Budget", legend = 0,
  xlab = "Transformed Gross", ylab = "Transformed Budget"
)

# Genre indicators: columns 9:29 hold the 21 genre dummies. Music and Musical
# (positions 13 and 14 within that slice) are merged into one Music flag.
rt <- mdata
genre_old <- rt[, 9:29]
Music <- as.numeric(genre_old$Music | genre_old$Musical)
genre <- cbind(genre_old[, -(13:14)], Music)

# Untransformed full model: gross on budget, IMDB score, the genre dummies in
# columns 10:30 (column 9, Action, is left out), RT score and release year as
# a factor. The year column is deliberately left with its default cbind name
# so the coefficient labels match the recorded output.
data_initial <- rt[, c(2, 6, 8, 10:30)]
data_initial <- cbind(data_initial, as.factor(rt$title_year))
lm1 <- lm(gross ~ ., data = data_initial)
summary(lm1)
##
## Call:
## lm(formula = gross ~ ., data = data_initial)
##
## Residuals:
## Min 1Q Median 3Q Max
## -236630412 -26076779 -7207365 16453032 460557534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.838e+07 1.291e+07 -4.521 6.72e-06 ***
## budget 1.097e+00 4.062e-02 27.001 < 2e-16 ***
## imdb_score 8.658e+06 2.159e+06 4.011 6.40e-05 ***
## Adventure -7.292e+06 5.172e+06 -1.410 0.15884
## Animation 5.613e+06 7.699e+06 0.729 0.46610
## Biography 5.138e+06 7.348e+06 0.699 0.48450
## Comedy 8.833e+06 4.102e+06 2.153 0.03147 *
## Crime -6.446e+06 4.799e+06 -1.343 0.17940
## Documentary -1.827e+07 1.093e+07 -1.672 0.09470 .
## Drama -1.121e+07 3.901e+06 -2.872 0.00414 **
## Family -4.552e+06 6.022e+06 -0.756 0.44989
## Fantasy -7.261e+05 4.776e+06 -0.152 0.87918
## History -7.774e+06 1.020e+07 -0.762 0.44618
## Horror 6.477e+06 5.873e+06 1.103 0.27030
## Music 1.009e+07 7.389e+06 1.365 0.17239
## Musical -2.715e+06 1.039e+07 -0.261 0.79394
## Mystery -1.456e+06 5.668e+06 -0.257 0.79738
## Romance 8.988e+05 4.117e+06 0.218 0.82724
## Sci.Fi 6.495e+06 5.093e+06 1.275 0.20244
## Sport -8.764e+06 7.761e+06 -1.129 0.25900
## Thriller -8.084e+05 4.422e+06 -0.183 0.85498
## War 7.136e+06 9.634e+06 0.741 0.45901
## Western -1.459e+07 1.529e+07 -0.954 0.34037
## rt_score 3.673e+05 7.802e+04 4.707 2.78e-06 ***
## `as.factor(rt$title_year)`2007 -3.844e+06 6.807e+06 -0.565 0.57239
## `as.factor(rt$title_year)`2008 -3.283e+06 6.523e+06 -0.503 0.61488
## `as.factor(rt$title_year)`2009 1.060e+06 6.516e+06 0.163 0.87085
## `as.factor(rt$title_year)`2010 -3.376e+06 6.608e+06 -0.511 0.60949
## `as.factor(rt$title_year)`2011 -7.857e+06 6.692e+06 -1.174 0.24055
## `as.factor(rt$title_year)`2012 3.803e+06 6.644e+06 0.572 0.56719
## `as.factor(rt$title_year)`2013 -5.225e+06 6.712e+06 -0.779 0.43638
## `as.factor(rt$title_year)`2014 3.953e+06 6.754e+06 0.585 0.55845
## `as.factor(rt$title_year)`2015 2.968e+06 7.045e+06 0.421 0.67357
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54160000 on 1296 degrees of freedom
## Multiple R-squared: 0.5759, Adjusted R-squared: 0.5655
## F-statistic: 55 on 32 and 1296 DF, p-value: < 2.2e-16
# Collect the pre-transformed continuous variables (everything after the first
# 30 columns), then add the squared transformed IMDB score and the release
# year as a factor.
trans1 <- read.csv("movie_rt_transform.csv")
continious_trans1 <- trans1[, -(1:30)]
continious_trans1 <- cbind(
  continious_trans1,
  continious_trans1$t_imdb_score^2,
  as.factor(rt$title_year)
)
names(continious_trans1)[5:6] <- c("t_imdb_score2", "year")
library(corrplot)
# Collapse the genre dummies into 3 to 7 coarser indicator groups. Each
# corrplot() call draws the genre correlation matrix in hierarchical-clustering
# order with k rectangles; the groupings below presumably were read off those
# plots.
# NOTE(review): Drama is a column of `genre` but belongs to none of the groups
# below -- confirm that this is intentional.

# 1 if the movie has at least one of the listed genres, 0 otherwise.
flag_any <- function(cols) as.numeric(Reduce(`|`, genre[cols]))

# Building blocks shared by the different groupings.
grp_light     <- c("Comedy", "Romance")                              # 2 genres
grp_family    <- c("Fantasy", "Adventure", "Animation", "Family")    # 4 genres
grp_doc_music <- c("Documentary", "Music")                           # 2 genres
grp_real      <- c("Sport", "War", "Biography", "History")           # 4 genres
grp_crime     <- c("Western", "Crime", "Thriller")                   # 3 genres
grp_action    <- c("Action", "Sci.Fi")                               # 2 genres
grp_dark      <- c("Horror", "Mystery")                              # 2 genres

## cluster into 3 groups
corrplot(cor(genre), order = "hclust", addrect = 3, method = "color")
c3_1 <- flag_any(c(grp_light, grp_family, grp_doc_music)) # 8 genres
c3_2 <- flag_any(grp_real)                                # 4 genres
c3_3 <- flag_any(c(grp_crime, grp_action, grp_dark))      # 7 genres
c3 <- cbind(c3_1, c3_2, c3_3)
colSums(c3)
## c3_1 c3_2 c3_3
## 906 157 672
## cluster into 4 groups
corrplot(cor(genre), order = "hclust", addrect = 4, method = "color")
c4_1 <- flag_any(grp_light)                          # 2 genres
c4_2 <- flag_any(c(grp_family, grp_doc_music))       # 6 genres
c4_3 <- flag_any(grp_real)                           # 4 genres
c4_4 <- flag_any(c(grp_crime, grp_action, grp_dark)) # 7 genres
c4 <- cbind(c4_1, c4_2, c4_3, c4_4)
## cluster into 5 groups
corrplot(cor(genre), order = "hclust", addrect = 5, method = "color")
c5_1 <- flag_any(grp_light)                    # 2 genres
c5_2 <- flag_any(c(grp_family, grp_doc_music)) # 6 genres
c5_3 <- flag_any(grp_real)                     # 4 genres
c5_4 <- flag_any(grp_crime)                    # 3 genres
c5_5 <- flag_any(c(grp_action, grp_dark))      # 4 genres
c5 <- cbind(c5_1, c5_2, c5_3, c5_4, c5_5)
## cluster into 6 groups
corrplot(cor(genre), order = "hclust", addrect = 6, method = "color")
c6_1 <- flag_any(grp_light)                    # 2 genres
c6_2 <- flag_any(c(grp_family, grp_doc_music)) # 6 genres
c6_3 <- flag_any(grp_real)                     # 4 genres
c6_4 <- flag_any(grp_crime)                    # 3 genres
c6_5 <- flag_any(grp_action)                   # 2 genres
c6_6 <- flag_any(grp_dark)                     # 2 genres
c6 <- cbind(c6_1, c6_2, c6_3, c6_4, c6_5, c6_6)
## cluster into 7 groups
corrplot(cor(genre), order = "hclust", addrect = 7, method = "color")
c7_1 <- flag_any(grp_light)     # 2 genres
c7_2 <- flag_any(grp_family)    # 4 genres
c7_3 <- flag_any(grp_real)      # 4 genres
c7_4 <- flag_any(grp_crime)     # 3 genres
c7_5 <- flag_any(grp_action)    # 2 genres
c7_6 <- flag_any(grp_dark)      # 2 genres
c7_7 <- flag_any(grp_doc_music) # 2 genres
c7 <- cbind(c7_1, c7_2, c7_3, c7_4, c7_5, c7_6, c7_7)
# Fit the transformed-gross model once per genre grouping (3 to 7 groups) and
# once with all original genre dummies, then compare the fits by BIC.
#
# Every design frame is continious_trans1 (which already contains `year` as a
# factor) plus the group indicators. data_c3 previously also appended
# rt$title_year as a numeric column; that column is an exact linear
# combination of the year dummies, so it only produced an aliased (NA)
# coefficient and made data_c3 inconsistent with data_c4..data_c7. It is
# dropped here; fitted values, rank and BIC are unaffected.
data_c3 <- cbind(continious_trans1, c3)
lm_c3 <- lm(t_gross ~ ., data = as.data.frame(data_c3))
s3 <- summary(lm_c3)
plot(lm_c3$fitted.values, lm_c3$residuals)

data_c4 <- cbind(continious_trans1, c4)
lm_c4 <- lm(t_gross ~ ., data = as.data.frame(data_c4))
s4 <- summary(lm_c4)

data_c5 <- cbind(continious_trans1, c5)
lm_c5 <- lm(t_gross ~ ., data = as.data.frame(data_c5))
s5 <- summary(lm_c5)

data_c6 <- cbind(continious_trans1, c6)
lm_c6 <- lm(t_gross ~ ., data = as.data.frame(data_c6))
s6 <- summary(lm_c6)

data_c7 <- cbind(continious_trans1, c7)
lm_c7 <- lm(t_gross ~ ., data = as.data.frame(data_c7))
s7 <- summary(lm_c7)

# Reference model using every genre dummy instead of grouped indicators.
data_all <- cbind(continious_trans1, genre_old)
lm_all <- lm(t_gross ~ ., data = as.data.frame(data_all))
s_all <- summary(lm_all)

# One BIC value per candidate model (lower is better).
bic <- BIC(lm_c3, lm_c4, lm_c5, lm_c6, lm_c7, lm_all)
BIC suggests c6
We calculated Cook’s distance for the model (shown below). The maximum Cook’s distance is around 0.025, which is quite small.
##
## Attaching package: 'faraway'
## The following objects are masked from 'package:car':
##
## logit, vif
The range for standardized residuals is:
## [1] -4.629635 4.674346
## fitted(lm_c6) movie_title t_gross t_budget t_imdb_score
## 349 380.58399 Winter's Tale 44.9631 348.04469 6.2
## 1298 65.02993 Paranormal Activity 403.6925 40.26728 6.3
## t_rt_score studres stdres
## 349 -1.1092398 -4.666226 -4.629635
## 1298 0.9413322 4.712052 4.674346
##
## Call:
## lm(formula = t_gross ~ . + D1, data = as.data.frame(data_c6))
##
## Residuals:
## Min 1Q Median 3Q Max
## -325.56 -43.96 -0.32 46.11 209.93
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.989e+03 1.448e+03 -3.446 0.000587 ***
## t_budget 9.887e-01 2.603e-02 37.982 < 2e-16 ***
## t_imdb_score -7.571e+01 1.403e+01 -5.396 8.09e-08 ***
## t_rt_score 1.280e+01 3.445e+00 3.715 0.000212 ***
## t_imdb_score2 7.139e+00 1.209e+00 5.905 4.48e-09 ***
## `movie_rt_transform[, 7]` 2.577e+00 7.213e-01 3.572 0.000367 ***
## c6_1 1.643e+01 4.862e+00 3.380 0.000746 ***
## c6_2 -4.156e+00 4.718e+00 -0.881 0.378492
## c6_3 -1.990e+01 6.662e+00 -2.987 0.002873 **
## c6_4 -3.453e+00 5.135e+00 -0.672 0.501388
## c6_5 -7.184e+00 5.374e+00 -1.337 0.181539
## c6_6 2.338e+01 6.038e+00 3.871 0.000114 ***
## D1 3.402e+02 7.323e+01 4.645 3.73e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 72.7 on 1316 degrees of freedom
## Multiple R-squared: 0.6266, Adjusted R-squared: 0.6232
## F-statistic: 184 on 12 and 1316 DF, p-value: < 2.2e-16
It turns out to be significant; it is an outlier.
Dummy variable for Winter’s Tale: (it is significant; this movie is an outlier).
##
## Call:
## lm(formula = t_gross ~ . + D2, data = as.data.frame(data_c6))
##
## Residuals:
## Min 1Q Median 3Q Max
## -259.68 -44.65 0.22 45.59 335.30
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5044.9415 1449.1108 -3.481 0.000515 ***
## t_budget 0.9829 0.0260 37.802 < 2e-16 ***
## t_imdb_score -73.7175 14.0340 -5.253 1.75e-07 ***
## t_rt_score 12.6463 3.4481 3.668 0.000255 ***
## t_imdb_score2 6.9949 1.2091 5.785 9.05e-09 ***
## `movie_rt_transform[, 7]` 2.6017 0.7219 3.604 0.000325 ***
## c6_1 16.8378 4.8671 3.460 0.000558 ***
## c6_2 -3.5078 4.7235 -0.743 0.457848
## c6_3 -19.7999 6.6659 -2.970 0.003029 **
## c6_4 -4.7369 5.1323 -0.923 0.356203
## c6_5 -7.0889 5.3762 -1.319 0.187538
## c6_6 27.0472 6.0530 4.468 8.56e-06 ***
## D2 -330.1750 73.2520 -4.507 7.15e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 72.73 on 1316 degrees of freedom
## Multiple R-squared: 0.6262, Adjusted R-squared: 0.6228
## F-statistic: 183.7 on 12 and 1316 DF, p-value: < 2.2e-16
It turns out to be significant; it is an outlier.
##
## Call:
## lm(formula = t_gross ~ ., data = as.data.frame(data_c6), subset = (cook <
## 0.015))
##
## Residuals:
## Min 1Q Median 3Q Max
## -258.602 -44.241 0.003 45.607 206.804
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.218e+03 1.438e+03 -3.629 0.000295 ***
## t_budget 9.918e-01 2.584e-02 38.384 < 2e-16 ***
## t_imdb_score -8.160e+01 1.434e+01 -5.689 1.57e-08 ***
## t_rt_score 1.253e+01 3.434e+00 3.649 0.000273 ***
## t_imdb_score2 7.616e+00 1.226e+00 6.210 7.11e-10 ***
## `movie_rt_transform[, 7]` 2.698e+00 7.161e-01 3.768 0.000172 ***
## c6_1 1.751e+01 4.827e+00 3.628 0.000297 ***
## c6_2 -3.781e+00 4.689e+00 -0.806 0.420210
## c6_3 -1.970e+01 6.609e+00 -2.980 0.002933 **
## c6_4 -3.737e+00 5.093e+00 -0.734 0.463251
## c6_5 -7.705e+00 5.331e+00 -1.445 0.148611
## c6_6 2.547e+01 6.009e+00 4.239 2.40e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 72.1 on 1314 degrees of freedom
## Multiple R-squared: 0.6317, Adjusted R-squared: 0.6286
## F-statistic: 204.9 on 11 and 1314 DF, p-value: < 2.2e-16
library(censReg)
## Loading required package: maxLik
## Loading required package: miscTools
## Warning: package 'miscTools' was built under R version 3.3.2
##
## Please cite the 'maxLik' package as:
## Henningsen, Arne and Toomet, Ott (2011). maxLik: A package for maximum likelihood estimation in R. Computational Statistics 26(3), 443-458. DOI 10.1007/s00180-010-0217-1.
##
## If you have questions, suggestions, or comments regarding the 'maxLik' package, please use a forum or 'tracker' at maxLik's R-Forge site:
## https://r-forge.r-project.org/projects/maxlik/
# Rebuild the 6-group design frame for the censored-regression comparison.
movie_rt_transform <- mdata
continious_trans1 <- movie_rt_transform[, -(1:30)]
continious_trans1 <- cbind(
  continious_trans1,
  continious_trans1$t_imdb_score^2,
  movie_rt_transform[, 7]
)
names(continious_trans1)[5:6] <- c("t_imdb_score2", "year")

# Append one artificial all-zero row before turning year into a factor; this
# also creates a factor level "0" that only the artificial row uses.
# NOTE(review): presumably added so that censReg() sees a censored observation
# (the recorded output reports 1 left-censored case out of 1330) -- confirm
# that fitting with a fabricated row is intended.
continious_trans1 <- rbind(continious_trans1, 0)
continious_trans1$year <- as.factor(continious_trans1$year)

# Genre dummies (columns 9:29) with Music and Musical merged into one flag.
genre_old <- movie_rt_transform[, 9:29]
Music <- as.numeric(genre_old$Music | genre_old$Musical)
genre <- cbind(genre_old[, -(13:14)], Music)

# 1 if the movie has at least one of the listed genres, 0 otherwise.
any_genre <- function(cols) as.numeric(Reduce(`|`, genre[cols]))

c6_1 <- any_genre(c("Comedy", "Romance"))                      # 2 genres
c6_2 <- any_genre(c("Fantasy", "Adventure", "Animation",
                    "Family", "Documentary", "Music"))         # 6 genres
c6_3 <- any_genre(c("Sport", "War", "Biography", "History"))   # 4 genres
c6_4 <- any_genre(c("Western", "Crime", "Thriller"))           # 3 genres
c6_5 <- any_genre(c("Action", "Sci.Fi"))                       # 2 genres
c6_6 <- any_genre(c("Horror", "Mystery"))                      # 2 genres
c6 <- cbind(c6_1, c6_2, c6_3, c6_4, c6_5, c6_6)

# Matching all-zero row for the group indicators, so both pieces line up.
c6 <- rbind(c6, 0)
data_c6 <- cbind(continious_trans1, c6)

# Ordinary least squares on the augmented data. The formula is kept as
# written so the printed call matches the recorded output.
lm_c6 <- lm(data_c6$t_gross ~ ., data = as.data.frame(data_c6))
summary(lm_c6)
##
## Call:
## lm(formula = data_c6$t_gross ~ ., data = as.data.frame(data_c6))
##
## Residuals:
## Min 1Q Median 3Q Max
## -335.62 -45.60 0.31 46.17 338.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.289e-12 7.322e+01 0.000 1.000000
## t_budget 9.866e-01 2.627e-02 37.556 < 2e-16 ***
## t_imdb_score -7.399e+01 1.415e+01 -5.230 1.97e-07 ***
## t_rt_score 1.394e+01 3.484e+00 4.002 6.64e-05 ***
## t_imdb_score2 6.939e+00 1.219e+00 5.692 1.55e-08 ***
## year2006 1.862e+02 8.455e+01 2.202 0.027817 *
## year2007 1.774e+02 8.465e+01 2.096 0.036285 *
## year2008 1.803e+02 8.458e+01 2.132 0.033191 *
## year2009 1.819e+02 8.471e+01 2.148 0.031921 *
## year2010 1.847e+02 8.476e+01 2.179 0.029490 *
## year2011 1.831e+02 8.467e+01 2.162 0.030791 *
## year2012 1.990e+02 8.481e+01 2.347 0.019074 *
## year2013 1.991e+02 8.477e+01 2.349 0.018978 *
## year2014 2.073e+02 8.474e+01 2.447 0.014543 *
## year2015 1.905e+02 8.484e+01 2.246 0.024896 *
## c6_1 1.653e+01 4.904e+00 3.371 0.000772 ***
## c6_2 -4.686e+00 4.761e+00 -0.984 0.325241
## c6_3 -2.049e+01 6.748e+00 -3.036 0.002445 **
## c6_4 -4.309e+00 5.179e+00 -0.832 0.405524
## c6_5 -6.840e+00 5.433e+00 -1.259 0.208275
## c6_6 2.546e+01 6.101e+00 4.172 3.21e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 73.22 on 1309 degrees of freedom
## Multiple R-squared: 0.625, Adjusted R-squared: 0.6192
## F-statistic: 109.1 on 20 and 1309 DF, p-value: < 2.2e-16
# Censored-regression counterpart of lm_c6 with the same covariates, fitted
# with censReg()'s default censoring limits. The formula is kept term-for-term
# so the printed call and coefficient labels match the recorded output.
clm_s6 <- censReg(
  formula = data_c6$t_gross ~ data_c6$t_budget + data_c6$t_imdb_score +
    data_c6$t_rt_score + data_c6$t_imdb_score2 + data_c6$year +
    data_c6$c6_1 + data_c6$c6_2 + data_c6$c6_3 + data_c6$c6_4 +
    data_c6$c6_5 + data_c6$c6_6
)
summary(clm_s6)
##
## Call:
## censReg(formula = data_c6$t_gross ~ data_c6$t_budget + data_c6$t_imdb_score +
## data_c6$t_rt_score + data_c6$t_imdb_score2 + data_c6$year +
## data_c6$c6_1 + data_c6$c6_2 + data_c6$c6_3 + data_c6$c6_4 +
## data_c6$c6_5 + data_c6$c6_6)
##
## Observations:
## Total Left-censored Uncensored Right-censored
## 1330 1 1329 0
##
## Coefficients:
## Estimate Std. error t value Pr(> t)
## (Intercept) -231.20984 809.64485 -0.286 0.775208
## data_c6$t_budget 0.98660 0.02607 37.842 < 2e-16 ***
## data_c6$t_imdb_score -73.96865 14.03893 -5.269 1.37e-07 ***
## data_c6$t_rt_score 13.94225 3.45777 4.032 5.53e-05 ***
## data_c6$t_imdb_score2 6.93755 1.20982 5.734 9.79e-09 ***
## data_c6$year2006 417.36904 810.73131 0.515 0.606689
## data_c6$year2007 408.58067 810.74172 0.504 0.614290
## data_c6$year2008 411.47693 810.73384 0.508 0.611779
## data_c6$year2009 413.09014 810.74759 0.510 0.610389
## data_c6$year2010 415.86186 810.75224 0.513 0.607998
## data_c6$year2011 414.22537 810.74352 0.511 0.609407
## data_c6$year2012 430.19563 810.75734 0.531 0.595689
## data_c6$year2013 430.27326 810.75374 0.531 0.595621
## data_c6$year2014 438.50428 810.75073 0.541 0.588603
## data_c6$year2015 421.68073 810.76120 0.520 0.602991
## data_c6$c6_1 16.52826 4.86652 3.396 0.000683 ***
## data_c6$c6_2 -4.68529 4.72530 -0.992 0.321426
## data_c6$c6_3 -20.48595 6.69707 -3.059 0.002221 **
## data_c6$c6_4 -4.30921 5.14010 -0.838 0.401834
## data_c6$c6_5 -6.84014 5.39199 -1.269 0.204592
## data_c6$c6_6 25.45859 6.05539 4.204 2.62e-05 ***
## logSigma 4.28595 0.01940 220.966 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Newton-Raphson maximisation, 16 iterations
## Return code 2: successive function values within tolerance limit
## Log-likelihood: -7581.796 on 22 Df
The results for our covariates of interest are very similar.